/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.db; import java.io.*; import java.util.*; import net.nutch.io.*; /********************************************************** * DBSectionReader reads a discrete portion of a WebDB. * It may implement its methods with either a local * MapFile.Reader object or (eventually) a remote- * machine network interface. For the moment, we * do only the MapFile.Reader implementation (much of * the code for this was moved from the earlier * pre-distributed version of WebDBReadaer). * * @author Mike Cafarella ***********************************************/ public class DBSectionReader { File sectionFile; WritableComparator comparator; MapFile.Reader reader; /** * Right now we assume we're getting a File that is a * MapFile.Reader directory. But in the future we could * also check for existence of a "remote-network" file, similar * to the way we do now for distributed index reading. * Then, we would either create a MapFile.Reader or a network * client for one. */ public DBSectionReader(File sectionFile, WritableComparator comparator) throws IOException { this.sectionFile = sectionFile; this.comparator = comparator; this.reader = new MapFile.Reader(sectionFile.getPath(), comparator); } /** * Fetch a Page with the given URL, and fill it into * the pre-allocated Page 'p'. */ public Page getPage(UTF8 url, Page p) throws IOException { return (Page) reader.get(url, p); } /** * Get Pages from the db according to their * content hash. */ public Vector getPages(MD5Hash md5) throws IOException { Vector records = new Vector(3); Page p = new Page(); p.getMD5().set(md5); reader.seek(p); while (reader.next(p, NullWritable.get())) { if (p.getMD5().compareTo(md5) == 0) { records.add(p); p = new Page(); } else { break; } } return records; } /** * Test whether a certain piece of content is in the * db, but don't bother returning it. */ public boolean pageExists(MD5Hash md5) throws IOException { Page p = new Page(); p.getMD5().set(md5); reader.seek(p); if (reader.next(p, NullWritable.get()) && p.getMD5().compareTo(md5) == 0) { return true; } else { return false; } } /** * Iterate through all the Pages, sorted by URL */ public Enumeration pages() throws IOException { return new TableEnumerator(new MapFile.Reader(sectionFile.getPath(), comparator)); } // // The TableEnumerator goes through all the entries // in the Table (which is a MapFile). // class TableEnumerator implements Enumeration { MapFile.Reader reader; Page nextItem; /** * Start the cursor and find the first item. * Store it for later return. */ public TableEnumerator(MapFile.Reader reader) { this.reader = reader; this.nextItem = new Page(); try { if (! reader.next(new UTF8(), this.nextItem)) { this.nextItem = null; } } catch (IOException ie) { ie.printStackTrace(); this.nextItem = null; } } /** * If there's no item left in store, we've hit the end. */ public boolean hasMoreElements() { return (nextItem != null); } /** * Set aside the item we have in store. Then retrieve * another for the next time we're called. Finally, return * the set-aside item. */ public Object nextElement() { if (nextItem == null) { throw new NoSuchElementException("PageDB Enumeration"); } Page toReturn = nextItem; this.nextItem = new Page(); try { if (! reader.next(new UTF8(), nextItem)) { this.nextItem = null; } } catch (IOException ie) { this.nextItem = null; } return toReturn; } } /** * Iterate through all the Pages, sorted by MD5 */ public Enumeration pagesByMD5() throws IOException { return new IndexEnumerator(new SetFile.Reader(sectionFile.getPath(), comparator)); } // // The IndexEnumerator goes through all the entries // in the index (which is a SequenceFile). // class IndexEnumerator implements Enumeration { SetFile.Reader reader; Page nextItem; /** * Start the cursor and find the first item. * Store it for later return. */ public IndexEnumerator(SetFile.Reader reader) { this.reader = reader; this.nextItem = new Page(); try { if (! reader.next(nextItem)) { this.nextItem = null; } } catch (IOException ie) { this.nextItem = null; } } /** * If there's no item left in store, we've hit the end. */ public boolean hasMoreElements() { return (nextItem != null); } /** * Set aside the item we have in store. Then retrieve * another for the next time we're called. Finally, return * the set-aside item. */ public Object nextElement() { if (nextItem == null) { throw new NoSuchElementException("PageDB Enumeration"); } Page toReturn = nextItem; this.nextItem = new Page(); try { if (! reader.next(nextItem)) { this.nextItem = null; } } catch (IOException ie) { this.nextItem = null; } return toReturn; } } /** * Get all the hyperlinks that link TO the indicated URL. */ public Vector getLinks(UTF8 url) throws IOException { Vector records = new Vector(3); Link l = new Link(); l.getURL().set(url); reader.seek(l); while (reader.next(l, NullWritable.get())) { if (url.equals(l.getURL())) { records.add(l); l = new Link(); } else { break; } } return records; } /** * Grab all the links from the given MD5 hash. */ public Vector getLinks(MD5Hash md5) throws IOException { Vector records = new Vector(3); Link l = new Link(); l.getFromID().set(md5); reader.seek(l); while (reader.next(l, NullWritable.get())) { if (md5.equals(l.getFromID())) { records.add(l); l = new Link(); } else { break; } } return records; } /** * Return all the links, by target URL */ public Enumeration links() throws IOException { return new MapEnumerator(new MapFile.Reader(sectionFile.getPath(), comparator)); } // // Here's the class for the above function // class MapEnumerator implements Enumeration { MapFile.Reader reader; Link nextItem; /** * Start the cursor and find the first item. * Store it for later return. */ public MapEnumerator(MapFile.Reader reader) { this.reader = reader; this.nextItem = new Link(); try { if (! reader.next(this.nextItem, NullWritable.get())) { this.nextItem = null; } } catch (IOException ie) { this.nextItem = null; } } /** * If there's no item left in store, we've hit the end. */ public boolean hasMoreElements() { return (nextItem != null); } /** * Set aside the item we have in store. Then retrieve * another for the next time we're called. Finally, return * the set-aside item. */ public Object nextElement() { if (nextItem == null) { throw new NoSuchElementException("PageDB Enumeration"); } Link toReturn = nextItem; this.nextItem = new Link(); try { if (! reader.next(nextItem, NullWritable.get())) { this.nextItem = null; } } catch (IOException ie) { this.nextItem = null; } return toReturn; } } /** */ public void close() throws IOException { reader.close(); } }